#installing the necessary packages if not pre-installed
#pip install pandas
#pip install plotly
#pip install seaborn
#pip install matplotlib
#pip install numpy
#!pip install nbformat
#!pip install chart_studio
#!pip install matplotlib-colorbar
#!pip install category_encoders
#importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as patches
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
#importing the e-commerce dataset into a dataframe
#importing the e-commerce dataset into a dataframe
# NOTE(review): absolute Windows path — this will only load on this machine; consider a relative path or a config value
ecd = pd.read_csv('C:\\Users\\rutvi\\OneDrive\\Desktop\\SEM 3\\DAB 303 Marketing Analytics\\E-Commerce Churn Data.csv')
ecd
| CustomerID | Churn | Tenure | PreferredLoginDevice | CityTier | WarehouseToHome | PreferredPaymentMode | Gender | HourSpendOnApp | NumberOfDeviceRegistered | PreferedOrderCat | SatisfactionScore | MaritalStatus | NumberOfAddress | Complain | OrderAmountHikeFromlastYear | CouponUsed | OrderCount | DaySinceLastOrder | CashbackAmount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50001 | 1 | 4.0 | Mobile Phone | 3 | 6.0 | Debit Card | Female | 3.0 | 3 | Laptop & Accessory | 2 | Single | 9 | 1 | 11.0 | 1.0 | 1.0 | 5.0 | 160 |
| 1 | 50002 | 1 | NaN | Phone | 1 | 8.0 | UPI | Male | 3.0 | 4 | Mobile | 3 | Single | 7 | 1 | 15.0 | 0.0 | 1.0 | 0.0 | 121 |
| 2 | 50003 | 1 | NaN | Phone | 1 | 30.0 | Debit Card | Male | 2.0 | 4 | Mobile | 3 | Single | 6 | 1 | 14.0 | 0.0 | 1.0 | 3.0 | 120 |
| 3 | 50004 | 1 | 0.0 | Phone | 3 | 15.0 | Debit Card | Male | 2.0 | 4 | Laptop & Accessory | 5 | Single | 8 | 0 | 23.0 | 0.0 | 1.0 | 3.0 | 134 |
| 4 | 50005 | 1 | 0.0 | Phone | 1 | 12.0 | CC | Male | NaN | 3 | Mobile | 5 | Single | 3 | 0 | 11.0 | 1.0 | 1.0 | 3.0 | 130 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5625 | 55626 | 0 | 10.0 | Computer | 1 | 30.0 | Credit Card | Male | 3.0 | 2 | Laptop & Accessory | 1 | Married | 6 | 0 | 18.0 | 1.0 | 2.0 | 4.0 | 151 |
| 5626 | 55627 | 0 | 13.0 | Mobile Phone | 1 | 13.0 | Credit Card | Male | 3.0 | 5 | Fashion | 5 | Married | 6 | 0 | 16.0 | 1.0 | 2.0 | NaN | 225 |
| 5627 | 55628 | 0 | 1.0 | Mobile Phone | 1 | 11.0 | Debit Card | Male | 3.0 | 2 | Laptop & Accessory | 4 | Married | 3 | 1 | 21.0 | 1.0 | 2.0 | 4.0 | 186 |
| 5628 | 55629 | 0 | 23.0 | Computer | 3 | 9.0 | Credit Card | Male | 4.0 | 5 | Laptop & Accessory | 4 | Married | 4 | 0 | 15.0 | 2.0 | 2.0 | 9.0 | 179 |
| 5629 | 55630 | 0 | 8.0 | Mobile Phone | 1 | 15.0 | Credit Card | Male | 3.0 | 2 | Laptop & Accessory | 3 | Married | 4 | 0 | 13.0 | 2.0 | 2.0 | 3.0 | 169 |
5630 rows × 20 columns
#listing the 20 column names of the dataset
ecd.columns
Index(['CustomerID', 'Churn', 'Tenure', 'PreferredLoginDevice', 'CityTier',
'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore',
'MaritalStatus', 'NumberOfAddress', 'Complain',
'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
'DaySinceLastOrder', 'CashbackAmount'],
dtype='object')
#checking the description of dataset
#format floats with thousands separators and 2 decimal places for readable summaries
pd.options.display.float_format = '{:20,.2f}'.format
ecd.describe().T  # transpose so each row is one column's summary statistics
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CustomerID | 5,630.00 | 52,815.50 | 1,625.39 | 50,001.00 | 51,408.25 | 52,815.50 | 54,222.75 | 55,630.00 |
| Churn | 5,630.00 | 0.17 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| Tenure | 5,366.00 | 10.19 | 8.56 | 0.00 | 2.00 | 9.00 | 16.00 | 61.00 |
| CityTier | 5,630.00 | 1.65 | 0.92 | 1.00 | 1.00 | 1.00 | 3.00 | 3.00 |
| WarehouseToHome | 5,379.00 | 15.64 | 8.53 | 5.00 | 9.00 | 14.00 | 20.00 | 127.00 |
| HourSpendOnApp | 5,375.00 | 2.93 | 0.72 | 0.00 | 2.00 | 3.00 | 3.00 | 5.00 |
| NumberOfDeviceRegistered | 5,630.00 | 3.69 | 1.02 | 1.00 | 3.00 | 4.00 | 4.00 | 6.00 |
| SatisfactionScore | 5,630.00 | 3.07 | 1.38 | 1.00 | 2.00 | 3.00 | 4.00 | 5.00 |
| NumberOfAddress | 5,630.00 | 4.21 | 2.58 | 1.00 | 2.00 | 3.00 | 6.00 | 22.00 |
| Complain | 5,630.00 | 0.28 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| OrderAmountHikeFromlastYear | 5,365.00 | 15.71 | 3.68 | 11.00 | 13.00 | 15.00 | 18.00 | 26.00 |
| CouponUsed | 5,374.00 | 1.75 | 1.89 | 0.00 | 1.00 | 1.00 | 2.00 | 16.00 |
| OrderCount | 5,372.00 | 3.01 | 2.94 | 1.00 | 1.00 | 2.00 | 3.00 | 16.00 |
| DaySinceLastOrder | 5,323.00 | 4.54 | 3.65 | 0.00 | 2.00 | 3.00 | 7.00 | 46.00 |
| CashbackAmount | 5,630.00 | 177.22 | 49.19 | 0.00 | 146.00 | 163.00 | 196.00 | 325.00 |
#previewing the first 10 records of the dataset
ecd.head(10)
| CustomerID | Churn | Tenure | PreferredLoginDevice | CityTier | WarehouseToHome | PreferredPaymentMode | Gender | HourSpendOnApp | NumberOfDeviceRegistered | PreferedOrderCat | SatisfactionScore | MaritalStatus | NumberOfAddress | Complain | OrderAmountHikeFromlastYear | CouponUsed | OrderCount | DaySinceLastOrder | CashbackAmount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50001 | 1 | 4.00 | Mobile Phone | 3 | 6.00 | Debit Card | Female | 3.00 | 3 | Laptop & Accessory | 2 | Single | 9 | 1 | 11.00 | 1.00 | 1.00 | 5.00 | 160 |
| 1 | 50002 | 1 | NaN | Phone | 1 | 8.00 | UPI | Male | 3.00 | 4 | Mobile | 3 | Single | 7 | 1 | 15.00 | 0.00 | 1.00 | 0.00 | 121 |
| 2 | 50003 | 1 | NaN | Phone | 1 | 30.00 | Debit Card | Male | 2.00 | 4 | Mobile | 3 | Single | 6 | 1 | 14.00 | 0.00 | 1.00 | 3.00 | 120 |
| 3 | 50004 | 1 | 0.00 | Phone | 3 | 15.00 | Debit Card | Male | 2.00 | 4 | Laptop & Accessory | 5 | Single | 8 | 0 | 23.00 | 0.00 | 1.00 | 3.00 | 134 |
| 4 | 50005 | 1 | 0.00 | Phone | 1 | 12.00 | CC | Male | NaN | 3 | Mobile | 5 | Single | 3 | 0 | 11.00 | 1.00 | 1.00 | 3.00 | 130 |
| 5 | 50006 | 1 | 0.00 | Computer | 1 | 22.00 | Debit Card | Female | 3.00 | 5 | Mobile Phone | 5 | Single | 2 | 1 | 22.00 | 4.00 | 6.00 | 7.00 | 139 |
| 6 | 50007 | 1 | NaN | Phone | 3 | 11.00 | Cash on Delivery | Male | 2.00 | 3 | Laptop & Accessory | 2 | Divorced | 4 | 0 | 14.00 | 0.00 | 1.00 | 0.00 | 121 |
| 7 | 50008 | 1 | NaN | Phone | 1 | 6.00 | CC | Male | 3.00 | 3 | Mobile | 2 | Divorced | 3 | 1 | 16.00 | 2.00 | 2.00 | 0.00 | 123 |
| 8 | 50009 | 1 | 13.00 | Phone | 3 | 9.00 | E wallet | Male | NaN | 4 | Mobile | 3 | Divorced | 2 | 1 | 14.00 | 0.00 | 1.00 | 2.00 | 127 |
| 9 | 50010 | 1 | NaN | Phone | 1 | 31.00 | Debit Card | Male | 2.00 | 5 | Mobile | 3 | Single | 2 | 0 | 12.00 | 1.00 | 1.00 | 1.00 | 123 |
#checking the shape of our dataset as (rows, columns)
ecd.shape
(5630, 20)
#checking the dataset information: dtypes, non-null counts and memory usage
ecd.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5630 entries, 0 to 5629 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 5630 non-null int64 1 Churn 5630 non-null int64 2 Tenure 5366 non-null float64 3 PreferredLoginDevice 5630 non-null object 4 CityTier 5630 non-null int64 5 WarehouseToHome 5379 non-null float64 6 PreferredPaymentMode 5630 non-null object 7 Gender 5630 non-null object 8 HourSpendOnApp 5375 non-null float64 9 NumberOfDeviceRegistered 5630 non-null int64 10 PreferedOrderCat 5630 non-null object 11 SatisfactionScore 5630 non-null int64 12 MaritalStatus 5630 non-null object 13 NumberOfAddress 5630 non-null int64 14 Complain 5630 non-null int64 15 OrderAmountHikeFromlastYear 5365 non-null float64 16 CouponUsed 5374 non-null float64 17 OrderCount 5372 non-null float64 18 DaySinceLastOrder 5323 non-null float64 19 CashbackAmount 5630 non-null int64 dtypes: float64(7), int64(8), object(5) memory usage: 879.8+ KB
#detecting which columns contain missing values
ecd.isnull().any()
CustomerID False Churn False Tenure True PreferredLoginDevice False CityTier False WarehouseToHome True PreferredPaymentMode False Gender False HourSpendOnApp True NumberOfDeviceRegistered False PreferedOrderCat False SatisfactionScore False MaritalStatus False NumberOfAddress False Complain False OrderAmountHikeFromlastYear True CouponUsed True OrderCount True DaySinceLastOrder True CashbackAmount False dtype: bool
#counting the missing values per column
ecd.isna().sum()
CustomerID 0 Churn 0 Tenure 264 PreferredLoginDevice 0 CityTier 0 WarehouseToHome 251 PreferredPaymentMode 0 Gender 0 HourSpendOnApp 255 NumberOfDeviceRegistered 0 PreferedOrderCat 0 SatisfactionScore 0 MaritalStatus 0 NumberOfAddress 0 Complain 0 OrderAmountHikeFromlastYear 265 CouponUsed 256 OrderCount 258 DaySinceLastOrder 307 CashbackAmount 0 dtype: int64
# Plot missing values: compute the per-column NaN counts once, then filter
# (the original evaluated ecd.isna().sum() twice)
missing_counts = ecd.isna().sum()
missing_counts[missing_counts > 0].plot(kind='bar', color=plt.cm.Paired.colors)
plt.title("Missing Values")
plt.xticks(rotation=45)
plt.show()
#total number of missing values in our dataset (sum over all columns)
ecd.isna().sum().sum()
1856
#creating a function to display the data type, percentage of missing values and number of unique values per column
def sniff_modified(df):
with pd.option_context("display.max_colwidth", 20):
info = pd.DataFrame()
info['data type'] = df.dtypes
info['percent missing'] = df.isnull().sum()*100/len(df)
info['No. unique'] = df.apply(lambda x: len(x.unique()))
info['unique values'] = df.apply(lambda x: x.unique())
return info.sort_values('data type')
sniff_modified(ecd)
| data type | percent missing | No. unique | unique values | |
|---|---|---|---|---|
| CustomerID | int64 | 0.00 | 5630 | [50001, 50002, 50003, 50004, 50005, 50006, 500... |
| Complain | int64 | 0.00 | 2 | [1, 0] |
| NumberOfAddress | int64 | 0.00 | 15 | [9, 7, 6, 8, 3, 2, 4, 10, 1, 5, 19, 21, 11, 20... |
| SatisfactionScore | int64 | 0.00 | 5 | [2, 3, 5, 4, 1] |
| NumberOfDeviceRegistered | int64 | 0.00 | 6 | [3, 4, 5, 2, 1, 6] |
| Churn | int64 | 0.00 | 2 | [1, 0] |
| CityTier | int64 | 0.00 | 3 | [3, 1, 2] |
| CashbackAmount | int64 | 0.00 | 220 | [160, 121, 120, 134, 130, 139, 123, 127, 295, ... |
| WarehouseToHome | float64 | 4.46 | 35 | [6.0, 8.0, 30.0, 15.0, 12.0, 22.0, 11.0, 9.0, ... |
| HourSpendOnApp | float64 | 4.53 | 7 | [3.0, 2.0, nan, 1.0, 0.0, 4.0, 5.0] |
| DaySinceLastOrder | float64 | 5.45 | 23 | [5.0, 0.0, 3.0, 7.0, 2.0, 1.0, 8.0, 6.0, 4.0, ... |
| Tenure | float64 | 4.69 | 37 | [4.0, nan, 0.0, 13.0, 11.0, 9.0, 19.0, 20.0, 1... |
| OrderAmountHikeFromlastYear | float64 | 4.71 | 17 | [11.0, 15.0, 14.0, 23.0, 22.0, 16.0, 12.0, nan... |
| CouponUsed | float64 | 4.55 | 18 | [1.0, 0.0, 4.0, 2.0, 9.0, 6.0, 11.0, nan, 7.0,... |
| OrderCount | float64 | 4.58 | 17 | [1.0, 6.0, 2.0, 15.0, 4.0, 7.0, 3.0, 9.0, nan,... |
| PreferredPaymentMode | object | 0.00 | 7 | [Debit Card, UPI, CC, Cash on Delivery, E wall... |
| Gender | object | 0.00 | 2 | [Female, Male] |
| PreferedOrderCat | object | 0.00 | 6 | [Laptop & Accessory, Mobile, Mobile Phone, Oth... |
| PreferredLoginDevice | object | 0.00 | 3 | [Mobile Phone, Phone, Computer] |
| MaritalStatus | object | 0.00 | 3 | [Single, Divorced, Married] |
#creating a function to find the columns with missing values, extract the number and percentage of these missing values in relation to the dataset
def FindMissingColsPercentage(df):
    """Print every column that has missing values as 'col => count [pct%]'.

    Prints 'no missing values' when the frame is fully populated.
    """
    total = 0
    for col in df.columns:
        # compute the null count once per column (the original evaluated
        # df[col].isnull() three times per column)
        missing_vals = df[col].isnull().sum()
        if missing_vals != 0:
            # share of missing rows in this column, as a percentage
            pct = df[col].isnull().mean() * 100
            print('{} => {} [{}%]'.format(col, missing_vals, round(pct, 2)))
            total += missing_vals
    if total == 0:
        print("no missing values")
FindMissingColsPercentage(ecd)
Tenure => 264 [4.69%] WarehouseToHome => 251 [4.46%] HourSpendOnApp => 255 [4.53%] OrderAmountHikeFromlastYear => 265 [4.71%] CouponUsed => 256 [4.55%] OrderCount => 258 [4.58%] DaySinceLastOrder => 307 [5.45%]
#replacing all empty-space entries with np.nan
# (np.nan instead of np.NaN: the NaN alias was removed in NumPy 2.0)
ecd_clean = ecd.replace(" ", np.nan)
# replacing all missing values (NaN) in the dataset with 0
# NOTE(review): zero-imputation biases means of Tenure/HourSpendOnApp etc. —
# median or mode imputation may be preferable; confirm this is intended
ecd_clean = ecd_clean.fillna(0)
#checking dataset information after replacing the missing values (all columns should now be fully non-null)
ecd_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5630 entries, 0 to 5629 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 5630 non-null int64 1 Churn 5630 non-null int64 2 Tenure 5630 non-null float64 3 PreferredLoginDevice 5630 non-null object 4 CityTier 5630 non-null int64 5 WarehouseToHome 5630 non-null float64 6 PreferredPaymentMode 5630 non-null object 7 Gender 5630 non-null object 8 HourSpendOnApp 5630 non-null float64 9 NumberOfDeviceRegistered 5630 non-null int64 10 PreferedOrderCat 5630 non-null object 11 SatisfactionScore 5630 non-null int64 12 MaritalStatus 5630 non-null object 13 NumberOfAddress 5630 non-null int64 14 Complain 5630 non-null int64 15 OrderAmountHikeFromlastYear 5630 non-null float64 16 CouponUsed 5630 non-null float64 17 OrderCount 5630 non-null float64 18 DaySinceLastOrder 5630 non-null float64 19 CashbackAmount 5630 non-null int64 dtypes: float64(7), int64(8), object(5) memory usage: 879.8+ KB
ecd_clean.isnull().any()
CustomerID False Churn False Tenure False PreferredLoginDevice False CityTier False WarehouseToHome False PreferredPaymentMode False Gender False HourSpendOnApp False NumberOfDeviceRegistered False PreferedOrderCat False SatisfactionScore False MaritalStatus False NumberOfAddress False Complain False OrderAmountHikeFromlastYear False CouponUsed False OrderCount False DaySinceLastOrder False CashbackAmount False dtype: bool
FindMissingColsPercentage(ecd_clean)
no missing values
#viewing the five object-dtype (categorical) columns
ecd_clean[['PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'PreferredLoginDevice', 'MaritalStatus']]
| PreferredPaymentMode | Gender | PreferedOrderCat | PreferredLoginDevice | MaritalStatus | |
|---|---|---|---|---|---|
| 0 | Debit Card | Female | Laptop & Accessory | Mobile Phone | Single |
| 1 | UPI | Male | Mobile | Phone | Single |
| 2 | Debit Card | Male | Mobile | Phone | Single |
| 3 | Debit Card | Male | Laptop & Accessory | Phone | Single |
| 4 | CC | Male | Mobile | Phone | Single |
| ... | ... | ... | ... | ... | ... |
| 5625 | Credit Card | Male | Laptop & Accessory | Computer | Married |
| 5626 | Credit Card | Male | Fashion | Mobile Phone | Married |
| 5627 | Debit Card | Male | Laptop & Accessory | Mobile Phone | Married |
| 5628 | Credit Card | Male | Laptop & Accessory | Computer | Married |
| 5629 | Credit Card | Male | Laptop & Accessory | Mobile Phone | Married |
5630 rows × 5 columns
#checking the unique values in these columns
#names of the object-dtype columns to inspect (reused by later cells)
obj = ['PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'PreferredLoginDevice', 'MaritalStatus']
#print each column's category labels together with their frequencies
for col in obj:
    print(ecd_clean[col].value_counts(), '\n')
PreferredPaymentMode Debit Card 2314 Credit Card 1501 E wallet 614 UPI 414 COD 365 CC 273 Cash on Delivery 149 Name: count, dtype: int64 Gender Male 3384 Female 2246 Name: count, dtype: int64 PreferedOrderCat Laptop & Accessory 2050 Mobile Phone 1271 Fashion 826 Mobile 809 Grocery 410 Others 264 Name: count, dtype: int64 PreferredLoginDevice Mobile Phone 2765 Computer 1634 Phone 1231 Name: count, dtype: int64 MaritalStatus Married 2986 Single 1796 Divorced 848 Name: count, dtype: int64
Following a review of the unique values of these columns shown above, we can make the following assumptions:
'CC' and 'COD' are the same as 'Credit Card' and 'Cash on Delivery' respectively under the PreferredPaymentMode column; 'Mobile Phone' and 'Phone' connote the same meaning for the records in the PreferredLoginDevice column; and 'Mobile Phone' and 'Mobile' refer to the same category in the PreferedOrderCat column. Based on these, we will proceed to further clean the data by making the following replacements:
'CC' with 'Credit Card', 'COD' with 'Cash on Delivery', 'Phone' with 'Mobile Phone', and 'Mobile' with 'Mobile Phone'.
#consolidating label variants that refer to the same category,
#using one dict-based replace per column
ecd_clean['PreferredPaymentMode'] = ecd_clean['PreferredPaymentMode'].replace(
    {'CC': 'Credit Card', 'COD': 'Cash on Delivery'})
ecd_clean['PreferedOrderCat'] = ecd_clean['PreferedOrderCat'].replace({'Mobile': 'Mobile Phone'})
ecd_clean['PreferredLoginDevice'] = ecd_clean['PreferredLoginDevice'].replace({'Phone': 'Mobile Phone'})
#cross-checking that the replacements have taken effect
for col in obj:
    print(ecd_clean[col].value_counts(), '\n')
PreferredPaymentMode Debit Card 2314 Credit Card 1774 E wallet 614 Cash on Delivery 514 UPI 414 Name: count, dtype: int64 Gender Male 3384 Female 2246 Name: count, dtype: int64 PreferedOrderCat Mobile Phone 2080 Laptop & Accessory 2050 Fashion 826 Grocery 410 Others 264 Name: count, dtype: int64 PreferredLoginDevice Mobile Phone 3996 Computer 1634 Name: count, dtype: int64 MaritalStatus Married 2986 Single 1796 Divorced 848 Name: count, dtype: int64
#creating categories from tenure column into a new column - tenuregroup
ranges = [0,10,20,30,40,50,60,np.inf] #bin edges, right-closed: (0,10], (10,20], ...
group_names = ['0-10 years', '11-20 years', '21-30 years', '31-40 years', '41-50 years', '51-60 years', '61 years & above'] # one label per bin
#include_lowest=True makes the first bin [0,10] so Tenure == 0 is not dropped
ecd_clean['TenureGroup'] = pd.cut(ecd_clean['Tenure'], bins = ranges, labels = group_names, include_lowest = True)
ecd_clean[['Tenure', 'TenureGroup']]
| Tenure | TenureGroup | |
|---|---|---|
| 0 | 4.00 | 0-10 years |
| 1 | 0.00 | 0-10 years |
| 2 | 0.00 | 0-10 years |
| 3 | 0.00 | 0-10 years |
| 4 | 0.00 | 0-10 years |
| ... | ... | ... |
| 5625 | 10.00 | 0-10 years |
| 5626 | 13.00 | 11-20 years |
| 5627 | 1.00 | 0-10 years |
| 5628 | 23.00 | 21-30 years |
| 5629 | 8.00 | 0-10 years |
5630 rows × 2 columns
#checking the final clean data (first 20 rows, now including the TenureGroup column)
ecd_clean.head(20)
| CustomerID | Churn | Tenure | PreferredLoginDevice | CityTier | WarehouseToHome | PreferredPaymentMode | Gender | HourSpendOnApp | NumberOfDeviceRegistered | ... | SatisfactionScore | MaritalStatus | NumberOfAddress | Complain | OrderAmountHikeFromlastYear | CouponUsed | OrderCount | DaySinceLastOrder | CashbackAmount | TenureGroup | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50001 | 1 | 4.00 | Mobile Phone | 3 | 6.00 | Debit Card | Female | 3.00 | 3 | ... | 2 | Single | 9 | 1 | 11.00 | 1.00 | 1.00 | 5.00 | 160 | 0-10 years |
| 1 | 50002 | 1 | 0.00 | Mobile Phone | 1 | 8.00 | UPI | Male | 3.00 | 4 | ... | 3 | Single | 7 | 1 | 15.00 | 0.00 | 1.00 | 0.00 | 121 | 0-10 years |
| 2 | 50003 | 1 | 0.00 | Mobile Phone | 1 | 30.00 | Debit Card | Male | 2.00 | 4 | ... | 3 | Single | 6 | 1 | 14.00 | 0.00 | 1.00 | 3.00 | 120 | 0-10 years |
| 3 | 50004 | 1 | 0.00 | Mobile Phone | 3 | 15.00 | Debit Card | Male | 2.00 | 4 | ... | 5 | Single | 8 | 0 | 23.00 | 0.00 | 1.00 | 3.00 | 134 | 0-10 years |
| 4 | 50005 | 1 | 0.00 | Mobile Phone | 1 | 12.00 | Credit Card | Male | 0.00 | 3 | ... | 5 | Single | 3 | 0 | 11.00 | 1.00 | 1.00 | 3.00 | 130 | 0-10 years |
| 5 | 50006 | 1 | 0.00 | Computer | 1 | 22.00 | Debit Card | Female | 3.00 | 5 | ... | 5 | Single | 2 | 1 | 22.00 | 4.00 | 6.00 | 7.00 | 139 | 0-10 years |
| 6 | 50007 | 1 | 0.00 | Mobile Phone | 3 | 11.00 | Cash on Delivery | Male | 2.00 | 3 | ... | 2 | Divorced | 4 | 0 | 14.00 | 0.00 | 1.00 | 0.00 | 121 | 0-10 years |
| 7 | 50008 | 1 | 0.00 | Mobile Phone | 1 | 6.00 | Credit Card | Male | 3.00 | 3 | ... | 2 | Divorced | 3 | 1 | 16.00 | 2.00 | 2.00 | 0.00 | 123 | 0-10 years |
| 8 | 50009 | 1 | 13.00 | Mobile Phone | 3 | 9.00 | E wallet | Male | 0.00 | 4 | ... | 3 | Divorced | 2 | 1 | 14.00 | 0.00 | 1.00 | 2.00 | 127 | 11-20 years |
| 9 | 50010 | 1 | 0.00 | Mobile Phone | 1 | 31.00 | Debit Card | Male | 2.00 | 5 | ... | 3 | Single | 2 | 0 | 12.00 | 1.00 | 1.00 | 1.00 | 123 | 0-10 years |
| 10 | 50011 | 1 | 4.00 | Mobile Phone | 1 | 18.00 | Cash on Delivery | Female | 2.00 | 3 | ... | 3 | Divorced | 2 | 0 | 0.00 | 9.00 | 15.00 | 8.00 | 295 | 0-10 years |
| 11 | 50012 | 1 | 11.00 | Mobile Phone | 1 | 6.00 | Debit Card | Male | 3.00 | 4 | ... | 3 | Single | 10 | 1 | 13.00 | 0.00 | 1.00 | 0.00 | 154 | 11-20 years |
| 12 | 50013 | 1 | 0.00 | Mobile Phone | 1 | 11.00 | Cash on Delivery | Male | 2.00 | 3 | ... | 3 | Single | 2 | 1 | 13.00 | 2.00 | 2.00 | 2.00 | 134 | 0-10 years |
| 13 | 50014 | 1 | 0.00 | Mobile Phone | 1 | 15.00 | Credit Card | Male | 3.00 | 4 | ... | 3 | Divorced | 1 | 1 | 17.00 | 0.00 | 1.00 | 0.00 | 134 | 0-10 years |
| 14 | 50015 | 1 | 9.00 | Mobile Phone | 3 | 15.00 | Credit Card | Male | 3.00 | 4 | ... | 2 | Single | 2 | 0 | 16.00 | 0.00 | 4.00 | 7.00 | 196 | 0-10 years |
| 15 | 50016 | 1 | 0.00 | Mobile Phone | 2 | 12.00 | UPI | Male | 3.00 | 3 | ... | 5 | Married | 5 | 1 | 22.00 | 1.00 | 1.00 | 2.00 | 121 | 0-10 years |
| 16 | 50017 | 1 | 0.00 | Computer | 1 | 12.00 | Debit Card | Female | 0.00 | 4 | ... | 2 | Single | 2 | 1 | 18.00 | 1.00 | 1.00 | 0.00 | 129 | 0-10 years |
| 17 | 50018 | 1 | 0.00 | Mobile Phone | 3 | 11.00 | E wallet | Male | 2.00 | 4 | ... | 3 | Single | 2 | 1 | 11.00 | 1.00 | 1.00 | 3.00 | 157 | 0-10 years |
| 18 | 50019 | 1 | 0.00 | Computer | 1 | 13.00 | Debit Card | Male | 3.00 | 5 | ... | 3 | Single | 2 | 1 | 24.00 | 1.00 | 1.00 | 6.00 | 161 | 0-10 years |
| 19 | 50020 | 1 | 19.00 | Mobile Phone | 1 | 20.00 | Debit Card | Female | 3.00 | 3 | ... | 4 | Divorced | 10 | 1 | 18.00 | 1.00 | 4.00 | 3.00 | 150 | 11-20 years |
20 rows × 21 columns
# Frequency line charts for selected columns, to understand how their values are distributed
dist_columns = ['HourSpendOnApp', 'NumberOfDeviceRegistered', 'SatisfactionScore', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'DaySinceLastOrder']
fig, axes = plt.subplots(3, 2, figsize=(15, 15))
for axis, column in zip(axes.ravel(), dist_columns):
    frequencies = ecd[column].value_counts().sort_index()
    frequencies.plot(kind='line', ax=axis, title=column, color='purple')
    axis.set(xlabel='Values', ylabel='Frequency')
plt.tight_layout()
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
ecd_1 = ecd_clean.copy() #copy of the cleaned dataset so ecd_clean keeps its numeric Churn column
# Replace churn values with readable labels for plotting
ecd_1['Churn'] = ecd_1['Churn'].replace({0: "Customer Retained", 1: "Customer Churned"})
# Create the bar plot (bar height = mean HourSpendOnApp per churn group)
sns.barplot(x='Churn', y='HourSpendOnApp', data=ecd_1, palette='viridis')
plt.title("Churn Rate", fontsize=16)
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
# Tally churned vs retained customers from the 0/1 Churn flag
n_churned = (ecd_clean['Churn'] == 1).sum()
n_retained = (ecd_clean['Churn'] == 0).sum()
# Slice labels and sizes for the pie chart
labels = ['Customers Churned', 'Customers Retained']
sizes = [n_churned, n_retained]
# Indigo for churned, steel blue for retained
colors_given = ['#4B0082', '#4682B4']
# Pull the 'Customers Churned' slice slightly out of the pie
explode = (0.1, 0)
fig, ax = plt.subplots()
# Draw the pie chart with percentage labels on each slice
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90,
       wedgeprops={'edgecolor': 'black'}, colors=colors_given, explode=explode)
ax.axis('equal')  # keep the chart a circle
# Legend placed outside the axes
ax.legend(labels, loc="center left", bbox_to_anchor=(1.1, 1), edgecolor='black')
plt.title('Customer Churn Analysis', fontweight='bold')
plt.show()
#visualising the percentage of customers that churned
#value_counts() orders by frequency, so the majority class 0 (stayed) comes first and matches the first label
ecd_clean['Churn'].value_counts().plot.pie(labels=['Customers who stayed', 'Customers who left'],
    autopct='%1.1f%%',shadow=True, colors=sns.color_palette('Purples'))
plt.title("% of Customer's that churned")
plt.show()
#checking out churn rate by gender distribution using a pivot table
# NOTE(review): aggfunc='sum' totals Tenure per gender/churn cell — use aggfunc='count'
# if the intent was the number of customers per cell; confirm which was meant
ecd_clean.pivot_table(values="Tenure", index="Gender", columns="Churn",
    aggfunc='sum')
| Churn | 0 | 1 |
|---|---|---|
| Gender | ||
| Female | 22,011.00 | 946.00 |
| Male | 29,738.00 | 1,984.00 |
import seaborn as sns
import matplotlib.pyplot as plt
# Creating the histogram of churn status split by gender (dodged side-by-side bars)
plt.figure(figsize=(8, 5))
sns.histplot(data=ecd_1, x='Churn', hue='Gender', hue_order=['Male', 'Female'],
             multiple='dodge', palette='coolwarm', shrink=0.8)
# Title and labels
plt.title("Churn Rate by Gender")
plt.xlabel("Churn Status")
plt.ylabel("Count")
# NOTE(review): ecd_1['Churn'] already holds the string labels, so x ticks are
# categorical — forcing ticks at 0 and 1 may be redundant here; verify the axis
plt.xticks(ticks=[0, 1], labels=['Customer Retained', 'Customer Churned'])
plt.show()
# Number of customers in each tenure bucket
tenure_counts = (ecd_clean['TenureGroup']
                 .value_counts()
                 .rename_axis('TenureGroup')
                 .reset_index(name='Count'))
sns.barplot(x='TenureGroup', y='Count', data=tenure_counts, palette='coolwarm')
# Rotate x-axis labels for readability
plt.xticks(rotation=90)
plt.title("Customer Distribution by Tenure Group")
plt.xlabel("Tenure Group")
plt.ylabel("Count")
Text(0, 0.5, 'Count')
# Creating the bar plot (bar height = mean of the 0/1 Churn flag, i.e. churn rate per tenure group)
plt.figure(figsize=(10, 5))
sns.barplot(x='TenureGroup', y='Churn', data=ecd_clean, palette='coolwarm')
# Formatting the plot
plt.ylabel('Churn Rate')
plt.xlabel('Tenure Group')
plt.xticks(rotation=90)
plt.title("Churn Rate by Tenure Group")
plt.show()
sns.set_context("paper", font_scale=1.1)
# Overlaid histograms of DaySinceLastOrder, one per churn status
plt.figure(figsize=(10, 5))
sns.histplot(ecd_clean[ecd_clean["Churn"] == 0]["DaySinceLastOrder"],
             color="blue", label="Customers Retained", kde=False, bins=30, alpha=0.7)
sns.histplot(ecd_clean[ecd_clean["Churn"] == 1]["DaySinceLastOrder"],
             color="red", label="Customers Churned", kde=False, bins=30, alpha=0.7)
# Formatting
plt.legend(loc='upper right')
plt.ylabel('Count')
plt.xlabel('Days Since Last Order')
plt.title('Churn Analysis based on Days Since Last Order')
plt.show()
# Define a custom color palette (tomato, steelblue, limegreen — one per city tier)
custom_palette = ['#FF6347', '#4682B4', '#32CD32']
# Churn rate (mean of the 0/1 Churn flag) vs. order count, one line per city tier.
# errorbar=None replaces ci=None, which is deprecated since seaborn 0.12.
sns.lineplot(
    data=ecd, x='OrderCount', y="Churn",
    hue="CityTier", errorbar=None,
    palette=custom_palette
)
<Axes: xlabel='OrderCount', ylabel='Churn'>
import seaborn as sns
import matplotlib.pyplot as plt
# Define a custom color palette for 'Churn' values
# (with hue_order below, tomato maps to 'Customer Retained' and steel blue to 'Customer Churned')
custom_palette = ['#FF6347', '#4682B4']
# hue order, also reused by the later countplot cells
order = ['Customer Retained', 'Customer Churned']
# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Complaint flag vs churn, and satisfaction score vs churn, side by side
sns.countplot(data=ecd_1, x='Complain', hue='Churn', ax=axes[0], hue_order=order, palette=custom_palette)
sns.countplot(data=ecd_1, x='SatisfactionScore', hue='Churn', ax=axes[1], hue_order=order, palette=custom_palette)
plt.tight_layout()
plt.show()
# Define custom color palette
custom_palette = ['#FF6347', '#4682B4']
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Countplots with custom color palette
sns.countplot(data=ecd_1, x='CityTier', hue='Churn', ax=axes[0], hue_order=order, palette=custom_palette)
# Fixed: 'Grocery' was misspelled 'Grocey', which silently left that
# category's bars empty (the dataset's labels are shown by value_counts above)
sns.countplot(data=ecd_1, x='PreferedOrderCat', hue='Churn', ax=axes[1],
              order=['Laptop & Accessory', 'Mobile Phone', 'Fashion', 'Grocery', 'Others'],
              hue_order=order, palette=custom_palette)
plt.tight_layout()
plt.show()
# Define custom color palette
custom_palette = ['#FF6347', '#4682B4']
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Side-by-side bar plots using histplot with multiple='dodge';
# discrete=True centres one bar on each integer value
sns.histplot(data=ecd_1, x='HourSpendOnApp', hue='Churn', ax=axes[0], hue_order=order,
             multiple='dodge', palette=custom_palette, discrete=True)
sns.histplot(data=ecd_1, x='NumberOfDeviceRegistered', hue='Churn', ax=axes[1],
             hue_order=order, multiple='dodge', palette=custom_palette, discrete=True)
plt.tight_layout()
plt.show()
#counting customers per (Complain, Churn) combination.
#groupby().size() produces the same table as the original
#groupby().apply(lambda x: x['Churn'].count()) but in one pass and without
#relying on apply over the grouping columns (deprecated in pandas 2.x)
count_of_complaints = (ecd_clean.groupby(['Complain', 'Churn'])
                       .size()
                       .reset_index(name='No. of Customers'))
count_of_complaints
| Complain | Churn | No. of Customers | |
|---|---|---|---|
| 0 | 0 | 0 | 3586 |
| 1 | 0 | 1 | 440 |
| 2 | 1 | 0 | 1096 |
| 3 | 1 | 1 | 508 |
import matplotlib.pyplot as plt
# Data (hard-coded from the count_of_complaints table above)
complain_values = [0, 1]
churn_0 = [3586, 1096]  # Customers Retained, for Complain = 0 then 1
churn_1 = [440, 508]    # Customers Churned, for Complain = 0 then 1
complain_labels = ['No Complaint', 'Complaint']
bar_width = 0.30
# Positioning bars: second series shifted right by one bar width
r1 = range(len(complain_values))
r2 = [x + bar_width for x in r1]
# Create figure and axis
fig, ax = plt.subplots(figsize=(10, 6))
# Change bar colors
plt.bar(r1, churn_0, color='#3CB371', width=bar_width, edgecolor='black', label='Customers Stayed') # Green
plt.bar(r2, churn_1, color='#FF8C00', width=bar_width, edgecolor='black', label='Customers Left') # Orange
# Add value labels just above each bar
for i in range(len(complain_values)):
    plt.text(r1[i], churn_0[i] + 20, str(churn_0[i]), ha='center', va='bottom', color='black', fontweight='bold')
    plt.text(r2[i], churn_1[i] + 50, str(churn_1[i]), ha='center', va='bottom', color='black', fontweight='bold')
# Labels & Titles
plt.xlabel('Complain', fontweight='bold')
# Centre each group tick between its pair of bars
plt.xticks([r + bar_width/2 for r in range(len(complain_values))], complain_labels)
plt.ylabel('No. of Customers', fontweight='bold')
plt.title('Complaint Counts of Customers in a Company', fontweight='bold')
# Legend
plt.legend()
# Show plot
plt.show()
# Import all the libraries for machine learning models
!pip install --upgrade scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,precision_score,recall_score,f1_score
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
Requirement already satisfied: scikit-learn in c:\users\rutvi\anaconda3\lib\site-packages (1.6.1) Requirement already satisfied: numpy>=1.19.5 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn) (1.26.4) Requirement already satisfied: scipy>=1.6.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn) (1.11.4) Requirement already satisfied: joblib>=1.2.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn) (3.5.0)
#checking per-column unique counts and the dataframe info (TenureGroup is a category dtype)
ecd_clean.nunique()
ecd_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5630 entries, 0 to 5629 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 5630 non-null int64 1 Churn 5630 non-null int64 2 Tenure 5630 non-null float64 3 PreferredLoginDevice 5630 non-null object 4 CityTier 5630 non-null int64 5 WarehouseToHome 5630 non-null float64 6 PreferredPaymentMode 5630 non-null object 7 Gender 5630 non-null object 8 HourSpendOnApp 5630 non-null float64 9 NumberOfDeviceRegistered 5630 non-null int64 10 PreferedOrderCat 5630 non-null object 11 SatisfactionScore 5630 non-null int64 12 MaritalStatus 5630 non-null object 13 NumberOfAddress 5630 non-null int64 14 Complain 5630 non-null int64 15 OrderAmountHikeFromlastYear 5630 non-null float64 16 CouponUsed 5630 non-null float64 17 OrderCount 5630 non-null float64 18 DaySinceLastOrder 5630 non-null float64 19 CashbackAmount 5630 non-null int64 20 TenureGroup 5630 non-null category dtypes: category(1), float64(7), int64(8), object(5) memory usage: 885.7+ KB
#converting the category dtype of TenureGroup column to numerical value
#ordinal mapping: earlier tenure buckets get smaller codes (1..7), so order is preserved
encoder = ce.OrdinalEncoder(mapping=[{'col': 'TenureGroup', 'mapping': {'0-10 years': 1, '11-20 years': 2, '21-30 years':3,
                                                                        '31-40 years': 4, '41-50 years': 5, '51-60 years': 6,
                                                                        '61 years & above': 7}}])
encoder.fit(ecd_clean)
ecd_clean = encoder.transform(ecd_clean)
ecd_clean.head(10)
| CustomerID | Churn | Tenure | PreferredLoginDevice | CityTier | WarehouseToHome | PreferredPaymentMode | Gender | HourSpendOnApp | NumberOfDeviceRegistered | ... | SatisfactionScore | MaritalStatus | NumberOfAddress | Complain | OrderAmountHikeFromlastYear | CouponUsed | OrderCount | DaySinceLastOrder | CashbackAmount | TenureGroup | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50001 | 1 | 4.00 | Mobile Phone | 3 | 6.00 | Debit Card | Female | 3.00 | 3 | ... | 2 | Single | 9 | 1 | 11.00 | 1.00 | 1.00 | 5.00 | 160 | 1 |
| 1 | 50002 | 1 | 0.00 | Mobile Phone | 1 | 8.00 | UPI | Male | 3.00 | 4 | ... | 3 | Single | 7 | 1 | 15.00 | 0.00 | 1.00 | 0.00 | 121 | 1 |
| 2 | 50003 | 1 | 0.00 | Mobile Phone | 1 | 30.00 | Debit Card | Male | 2.00 | 4 | ... | 3 | Single | 6 | 1 | 14.00 | 0.00 | 1.00 | 3.00 | 120 | 1 |
| 3 | 50004 | 1 | 0.00 | Mobile Phone | 3 | 15.00 | Debit Card | Male | 2.00 | 4 | ... | 5 | Single | 8 | 0 | 23.00 | 0.00 | 1.00 | 3.00 | 134 | 1 |
| 4 | 50005 | 1 | 0.00 | Mobile Phone | 1 | 12.00 | Credit Card | Male | 0.00 | 3 | ... | 5 | Single | 3 | 0 | 11.00 | 1.00 | 1.00 | 3.00 | 130 | 1 |
| 5 | 50006 | 1 | 0.00 | Computer | 1 | 22.00 | Debit Card | Female | 3.00 | 5 | ... | 5 | Single | 2 | 1 | 22.00 | 4.00 | 6.00 | 7.00 | 139 | 1 |
| 6 | 50007 | 1 | 0.00 | Mobile Phone | 3 | 11.00 | Cash on Delivery | Male | 2.00 | 3 | ... | 2 | Divorced | 4 | 0 | 14.00 | 0.00 | 1.00 | 0.00 | 121 | 1 |
| 7 | 50008 | 1 | 0.00 | Mobile Phone | 1 | 6.00 | Credit Card | Male | 3.00 | 3 | ... | 2 | Divorced | 3 | 1 | 16.00 | 2.00 | 2.00 | 0.00 | 123 | 1 |
| 8 | 50009 | 1 | 13.00 | Mobile Phone | 3 | 9.00 | E wallet | Male | 0.00 | 4 | ... | 3 | Divorced | 2 | 1 | 14.00 | 0.00 | 1.00 | 2.00 | 127 | 2 |
| 9 | 50010 | 1 | 0.00 | Mobile Phone | 1 | 31.00 | Debit Card | Male | 2.00 | 5 | ... | 3 | Single | 2 | 0 | 12.00 | 1.00 | 1.00 | 1.00 | 123 | 1 |
10 rows × 21 columns
#importing libraries for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
#defining columns (identifier and target, needed to exclude them from encoding)
cust_id = ['CustomerID']
Target = ["Churn"]
#categorical columns: those with fewer than 7 unique values (target excluded)
cat_variable = ecd_clean.nunique()[ecd_clean.nunique() < 7].keys().tolist()
cat_variable = [x for x in cat_variable if x not in Target]
#numerical columns: everything that is not categorical, target, or identifier
numerical_vari = [x for x in ecd_clean if x not in cat_variable + Target + cust_id]
#binary columns: exactly 2 unique values
binary_vari = ecd_clean.nunique()[ecd_clean.nunique() == 2].keys().tolist()
#multi-category columns: categorical columns that are not binary
more_then_2_vari = [i for i in cat_variable if i not in binary_vari]
#label encoding binary columns.
#A single pass suffices: the original ran this identical loop twice, and
#re-encoding already-encoded 0/1 values is a no-op, so removing the duplicate
#preserves behavior while halving the work.
lab_encod = LabelEncoder()
for x in binary_vari:
    ecd_clean[x] = lab_encod.fit_transform(ecd_clean[x])
#summary statistics of the cleaned, encoded dataframe
ecd_clean.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CustomerID | 5,630.00 | 52,815.50 | 1,625.39 | 50,001.00 | 51,408.25 | 52,815.50 | 54,222.75 | 55,630.00 |
| Churn | 5,630.00 | 0.17 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| Tenure | 5,630.00 | 9.71 | 8.63 | 0.00 | 1.00 | 8.00 | 15.00 | 61.00 |
| PreferredLoginDevice | 5,630.00 | 0.71 | 0.45 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 |
| CityTier | 5,630.00 | 1.65 | 0.92 | 1.00 | 1.00 | 1.00 | 3.00 | 3.00 |
| WarehouseToHome | 5,630.00 | 14.94 | 8.94 | 0.00 | 9.00 | 13.00 | 20.00 | 127.00 |
| Gender | 5,630.00 | 0.60 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 |
| HourSpendOnApp | 5,630.00 | 2.80 | 0.93 | 0.00 | 2.00 | 3.00 | 3.00 | 5.00 |
| NumberOfDeviceRegistered | 5,630.00 | 3.69 | 1.02 | 1.00 | 3.00 | 4.00 | 4.00 | 6.00 |
| SatisfactionScore | 5,630.00 | 3.07 | 1.38 | 1.00 | 2.00 | 3.00 | 4.00 | 5.00 |
| NumberOfAddress | 5,630.00 | 4.21 | 2.58 | 1.00 | 2.00 | 3.00 | 6.00 | 22.00 |
| Complain | 5,630.00 | 0.28 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| OrderAmountHikeFromlastYear | 5,630.00 | 14.97 | 4.89 | 0.00 | 13.00 | 14.00 | 18.00 | 26.00 |
| CouponUsed | 5,630.00 | 1.67 | 1.89 | 0.00 | 1.00 | 1.00 | 2.00 | 16.00 |
| OrderCount | 5,630.00 | 2.87 | 2.94 | 0.00 | 1.00 | 2.00 | 3.00 | 16.00 |
| DaySinceLastOrder | 5,630.00 | 4.30 | 3.70 | 0.00 | 1.00 | 3.00 | 7.00 | 46.00 |
| CashbackAmount | 5,630.00 | 177.22 | 49.19 | 0.00 | 146.00 | 163.00 | 196.00 | 325.00 |
| TenureGroup | 5,630.00 | 1.55 | 0.75 | 1.00 | 1.00 | 1.00 | 2.00 | 7.00 |
import plotly.graph_objects as go
# Numeric variables included in the correlation analysis
numeric_columns = [
    "Churn", "Tenure", "CityTier", "WarehouseToHome", "HourSpendOnApp",
    "NumberOfDeviceRegistered", "SatisfactionScore", "NumberOfAddress",
    "Complain", "OrderAmountHikeFromlastYear", "CouponUsed", "OrderCount",
    "DaySinceLastOrder", "CashbackAmount"
]
# Compute the Pearson correlation matrix from the actual cleaned dataset.
# BUG FIX: the original computed correlations on np.random.rand placeholder
# data, so the rendered heatmap showed pure noise instead of the real
# relationships between the dataset's variables.
corr_matrix = ecd_clean[numeric_columns].corr().values
# Heatmap of pairwise correlations
trace = go.Heatmap(
    z=corr_matrix,
    x=numeric_columns,
    y=numeric_columns,
    colorscale="Viridis",
    colorbar=dict(title="Pearson Correlation Coefficient", titleside="right")
)
# Layout settings: wide left/bottom margins leave room for long column labels
layout = go.Layout(
    title="Correlation Matrix for Variables",
    autosize=False,
    height=720,
    width=800,
    margin=dict(r=0, l=210, t=25, b=210),
    yaxis=dict(tickfont=dict(size=9)),
    xaxis=dict(tickfont=dict(size=9))
)
# Render the figure
fig = go.Figure(data=[trace], layout=layout)
fig.show()
# Splitting dataset into train and test sets.
# BUG FIX: train_test_split was used here but only imported in a later cell,
# so running the notebook top-to-bottom raised NameError at this point.
from sklearn.model_selection import train_test_split
train, test = train_test_split(ecd_clean, test_size=0.20, random_state=0)
# Define the columns excluding the identifier and the target variable
cust_id = ["CustomerID"]  # 'CustomerID' is only a row identifier
Target = ["Churn"]        # 'Churn' is the prediction target
cols = [col for col in ecd_clean.columns if col not in cust_id + Target]
# Splitting into features (X) and target (Y)
X_train = train[cols]
Y_train = train[Target]
X_test = test[cols]
Y_test = test[Target]
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Identify categorical columns to encode
categorical_columns = ["PreferredPaymentMode", "Gender", "MaritalStatus", "PreferredLoginDevice", "PreferedOrderCat"]
# Encode categorical variables using Label Encoding; one encoder per column is
# kept in label_encoders so the mapping can be inverted later if needed.
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    ecd_clean[col] = label_encoders[col].fit_transform(ecd_clean[col])
# Define identifier and target columns
cust_id = ["CustomerID"]  # row identifier only — excluded from features
Target = ["Churn"]        # binary churn target
# Select feature columns (excluding CustomerID and Target)
cols = [col for col in ecd_clean.columns if col not in cust_id + Target]
# Split dataset into training and testing sets (80/20, fixed seed)
train, test = train_test_split(ecd_clean, test_size=0.20, random_state=0)
X_train = train[cols]
Y_train = train[Target]
X_test = test[cols]
Y_test = test[Target]
# Train the Logistic Regression model.
# max_iter is raised from the default 100 because lbfgs commonly fails to
# converge on unscaled features; .values.ravel() passes a 1-D target and
# avoids sklearn's DataConversionWarning for an (n, 1) DataFrame.
logistic_regression_model = LogisticRegression(random_state=0, max_iter=1000)
logistic_regression_model.fit(X_train, Y_train.values.ravel())
# Make predictions on the held-out test set
Y_pred = logistic_regression_model.predict(X_test)
# Evaluate model performance
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
# Print metrics
print(f"\nModel Performance Metrics:")
print(f"---------------------------------")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}\n")
# Generate and visualize the confusion matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)
df_cm = pd.DataFrame(conf_matrix, index=["Actual: No Churn", "Actual: Churn"],
                     columns=["Predicted: No Churn", "Predicted: Churn"])
plt.figure(figsize=(8, 6))
sns.heatmap(df_cm, annot=True, fmt='d', cmap='coolwarm', linewidths=1, cbar=True)
plt.title("Confusion Matrix")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.show()
# Print test data accuracy separately
print(f"Test Data Accuracy: {accuracy:.6f}")
Model Performance Metrics: --------------------------------- Accuracy: 0.8694 Precision: 0.6639 Recall: 0.4332 F1 Score: 0.5243
Test Data Accuracy: 0.869449
# Generate a classification report (per-class precision/recall/F1) for the
# logistic regression predictions.
# BUG FIX: classification_report was never imported in any earlier cell,
# so this raised NameError on a fresh kernel run.
from sklearn.metrics import classification_report
classification_report_result = classification_report(Y_test, Y_pred)
print("Classification Report:")
print(classification_report_result)
Classification Report:
precision recall f1-score support
0 0.89 0.96 0.92 939
1 0.66 0.43 0.52 187
accuracy 0.87 1126
macro avg 0.78 0.69 0.72 1126
weighted avg 0.86 0.87 0.86 1126
Decision Tree Classifier and Random Forest Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Prepare data.
# FIX: CustomerID is a row identifier, not a predictor — leaving it in the
# feature matrix lets the tree split on an arbitrary ID. The earlier
# logistic-regression cell already excludes it; this makes the models
# consistent.
X = ecd_clean.drop(['Churn', 'CustomerID'], axis=1)
y = ecd_clean['Churn']
# Split dataset (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=128)
# Train Decision Tree Classifier
deci_tree = DecisionTreeClassifier(random_state=42)
deci_tree.fit(X_train, y_train)
# Make predictions
deci_tree_pred = deci_tree.predict(X_test)
# Calculate evaluation metrics
accuracy_dt = accuracy_score(y_test, deci_tree_pred)
precision_dt = precision_score(y_test, deci_tree_pred)
recall_dt = recall_score(y_test, deci_tree_pred)
# Print performance metrics
print("\nDecision Tree Model Performance:")
print("---------------------------------")
print(f"Accuracy: {accuracy_dt:.4f}")
print(f"Precision: {precision_dt:.4f}")
print(f"Recall: {recall_dt:.4f}\n")
# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, deci_tree_pred)
# Visualizing confusion matrix with "YlGnBu" colormap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu", linewidths=1, cbar=True)
# Labels and title
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (Decision Tree)")
plt.show()
Decision Tree Model Performance: --------------------------------- Accuracy: 0.9440 Precision: 0.8208 Recall: 0.8744
#classification report for the decision tree predictions
#BUG FIX: classification_report was never imported in any earlier cell,
#so this raised NameError on a fresh kernel run.
from sklearn.metrics import classification_report
class_report = classification_report(y_test, deci_tree_pred)
print("Classification Report for Decision Tree:\n", class_report)
Classification Report for Decision Tree:
precision recall f1-score support
0 0.97 0.96 0.97 927
1 0.82 0.87 0.85 199
accuracy 0.94 1126
macro avg 0.90 0.92 0.91 1126
weighted avg 0.95 0.94 0.94 1126
# Install rfpimp (permutation-importance helpers for random forests) if missing.
!pip install rfpimp
# NOTE(review): star import pulls every rfpimp name into the namespace, and no
# rfpimp function appears to be used in the cells below — consider removing
# this import or importing explicit names (e.g. `from rfpimp import importances`).
from rfpimp import *
Requirement already satisfied: rfpimp in c:\users\rutvi\anaconda3\lib\site-packages (1.3.7) Requirement already satisfied: numpy in c:\users\rutvi\anaconda3\lib\site-packages (from rfpimp) (1.26.4) Requirement already satisfied: pandas in c:\users\rutvi\anaconda3\lib\site-packages (from rfpimp) (2.1.4) Requirement already satisfied: scikit-learn in c:\users\rutvi\anaconda3\lib\site-packages (from rfpimp) (1.6.1) Requirement already satisfied: matplotlib in c:\users\rutvi\anaconda3\lib\site-packages (from rfpimp) (3.8.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (23.1) Requirement already satisfied: pillow>=6.2.0 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (10.2.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\rutvi\anaconda3\lib\site-packages (from matplotlib->rfpimp) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\rutvi\anaconda3\lib\site-packages (from pandas->rfpimp) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\rutvi\anaconda3\lib\site-packages (from pandas->rfpimp) (2023.3) Requirement already satisfied: scipy>=1.6.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn->rfpimp) (1.11.4) Requirement already satisfied: joblib>=1.2.0 in 
c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn->rfpimp) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\rutvi\anaconda3\lib\site-packages (from scikit-learn->rfpimp) (3.5.0) Requirement already satisfied: six>=1.5 in c:\users\rutvi\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->rfpimp) (1.16.0)
import matplotlib.pyplot as plt
# Importance scores learned by the fitted decision tree, paired with the
# column names of the feature matrix X.
dt_importances = deci_tree.feature_importances_
dt_feature_labels = X.columns
# Horizontal bar chart — one teal bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(dt_feature_labels, dt_importances, color='teal')
ax.set_xlabel('Feature Importance', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.set_title('Decision Tree Feature Importance', fontsize=14)
# Dashed vertical gridlines make bar lengths easier to compare.
ax.grid(axis='x', linestyle='--', alpha=0.7)
# Display the plot
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Building our model using Random Forest Classifier.
# FIX: random_state is now fixed on the classifier — previously it was unset,
# so the forest (and therefore every reported metric) changed on each rerun.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=324)
rand_fcl = RandomForestClassifier(n_estimators=120, random_state=42)
rand_fcl.fit(X_train, y_train)
rand_fcl_pred = rand_fcl.predict(X_test)
# Evaluate the model's performance
accuracy_rfc = accuracy_score(y_test, rand_fcl_pred)
precision_rfc = precision_score(y_test, rand_fcl_pred)
recall_rfc = recall_score(y_test, rand_fcl_pred)
print("Accuracy:", accuracy_rfc)
print("Precision:", precision_rfc)
print("Recall:", recall_rfc)
# Create a confusion matrix
conf_matrix = confusion_matrix(y_test, rand_fcl_pred)
plt.figure(figsize=(8, 6))
# Using a lighter color palette like 'Blues' for the heatmap
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d", cbar=False)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix Random Forest Classifier")
plt.show()
Accuracy: 0.9706927175843695 Precision: 0.9649122807017544 Recall: 0.859375
import matplotlib.pyplot as plt
# Importance scores assigned by the trained random forest, one per column of
# the feature matrix X.
rf_importances = rand_fcl.feature_importances_
rf_feature_labels = X.columns
# Horizontal bar chart — one light-coral bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(rf_feature_labels, rf_importances, color='lightcoral')
ax.set_xlabel('Feature Importance', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.set_title('Random Forest Feature Importance', fontsize=14)
# Dashed vertical gridlines make bar lengths easier to compare.
ax.grid(axis='x', linestyle='--', alpha=0.7)
# Display the plot
plt.show()